import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
#pip install -U kaleido #for saving figures into png, jpeg format
data = pd.read_excel("../Data/socotecDB.xls")
data.head()
WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero
| ID | BILL_ID | SUPPLIER | OFFICE | UTILITY_LOCATION | UTILITY_CITY | UTILITY_REGION | SUPPLY_TYPE | MONTHLY_F1 | MONTHLY_F2 | MONTHLY_F3 | MC | MONTHLY_SMC | MONTH | COST_Kw_h | COST_MC | COST_SMC | MONTHLY_BILL_AMOUNT | RENEWABLE_SOURCE_PERCENTUAL | NOTES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 7026491105 | A2A ENERGIA SPA | ALTRO | VIA EDISON TOMMASO 045 - 20026 NOVATE MILANESE MI | MI | LOMBARDIA | METANO | 0.0 | 0.0 | 0.0 | 0.0 | 336.78 | GENNAIO | 0.0 | 0.0 | 0.65 | 218.0 | 0.0 | 0 |
| 1 | 2 | 7026491105 | A2A ENERGIA SPA | ALTRO | VIA EDISON TOMMASO 045 - 20026 NOVATE MILANESE MI | MI | LOMBARDIA | METANO | 0.0 | 0.0 | 0.0 | 0.0 | 202.47 | FEBBRAIO | 0.0 | 0.0 | 0.66 | 134.0 | 0.0 | 0 |
| 2 | 3 | 7026491105 | A2A ENERGIA SPA | ALTRO | VIA EDISON TOMMASO 045 - 20026 NOVATE MILANESE MI | MI | LOMBARDIA | METANO | NaN | NaN | NaN | 0.0 | 132.31 | MARZO | 0.0 | 0.0 | 0.81 | 107.0 | 0.0 | 0 |
| 3 | 4 | 7026491105 | A2A ENERGIA SPA | ALTRO | VIA EDISON TOMMASO 045 - 20026 NOVATE MILANESE MI | MI | LOMBARDIA | METANO | 0.0 | 0.0 | 0.0 | 0.0 | 71.16 | APRILE | 0.0 | 0.0 | 0.82 | 58.0 | 0.0 | 0 |
| 4 | 5 | 7026491105 | A2A ENERGIA SPA | ALTRO | VIA EDISON TOMMASO 045 - 20026 NOVATE MILANESE MI | MI | LOMBARDIA | METANO | 0.0 | 0.0 | 0.0 | 0.0 | 23.05 | MAGGIO | 0.0 | 0.0 | 1.47 | 34.0 | 0.0 | 0 |
data.shape
(578, 20)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 578 entries, 0 to 577 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 578 non-null int64 1 BILL_ID 578 non-null object 2 SUPPLIER 578 non-null object 3 OFFICE 578 non-null object 4 UTILITY_LOCATION 578 non-null object 5 UTILITY_CITY 578 non-null object 6 UTILITY_REGION 578 non-null object 7 SUPPLY_TYPE 577 non-null object 8 MONTHLY_F1 576 non-null float64 9 MONTHLY_F2 576 non-null float64 10 MONTHLY_F3 575 non-null float64 11 MC 573 non-null float64 12 MONTHLY_SMC 578 non-null float64 13 MONTH 578 non-null object 14 COST_Kw_h 576 non-null float64 15 COST_MC 571 non-null float64 16 COST_SMC 577 non-null float64 17 MONTHLY_BILL_AMOUNT 576 non-null float64 18 RENEWABLE_SOURCE_PERCENTUAL 567 non-null float64 19 NOTES 575 non-null object dtypes: float64(10), int64(1), object(9) memory usage: 90.4+ KB
data.isna().any()
ID False BILL_ID False SUPPLIER False OFFICE False UTILITY_LOCATION False UTILITY_CITY False UTILITY_REGION False SUPPLY_TYPE True MONTHLY_F1 True MONTHLY_F2 True MONTHLY_F3 True MC True MONTHLY_SMC False MONTH False COST_Kw_h True COST_MC True COST_SMC True MONTHLY_BILL_AMOUNT True RENEWABLE_SOURCE_PERCENTUAL True NOTES True dtype: bool
data["MONTH"].unique()
array(['GENNAIO', 'FEBBRAIO', 'MARZO', 'APRILE', 'MAGGIO', 'GIUGNO',
'LUGLIO', 'OTTOBRE', 'NOVEMBRE', 'DICEMBRE', 'SETTEMBRE', 'AGOSTO'],
dtype=object)
#Datetime correction = rename all the month and change the type of this column
data["MONTH"] = data["MONTH"].replace({"GENNAIO":"January",
"FEBBRAIO": "February",
"MARZO": "March",
"APRILE": "April",
"MAGGIO": "May",
"GIUGNO": "June",
"LUGLIO": "July",
"AGOSTO": "August",
"SETTEMBRE": "September",
"OTTOBRE": "October",
"NOVEMBRE": "November",
"DICEMBRE": "December"})
data["MONTH"] = pd.to_datetime("2021-" + data["MONTH"],format="%Y-%B")
data.info() #Check the type of [MONTH]
<class 'pandas.core.frame.DataFrame'> RangeIndex: 578 entries, 0 to 577 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 578 non-null int64 1 BILL_ID 578 non-null object 2 SUPPLIER 578 non-null object 3 OFFICE 578 non-null object 4 UTILITY_LOCATION 578 non-null object 5 UTILITY_CITY 578 non-null object 6 UTILITY_REGION 578 non-null object 7 SUPPLY_TYPE 577 non-null object 8 MONTHLY_F1 576 non-null float64 9 MONTHLY_F2 576 non-null float64 10 MONTHLY_F3 575 non-null float64 11 MC 573 non-null float64 12 MONTHLY_SMC 578 non-null float64 13 MONTH 578 non-null datetime64[ns] 14 COST_Kw_h 576 non-null float64 15 COST_MC 571 non-null float64 16 COST_SMC 577 non-null float64 17 MONTHLY_BILL_AMOUNT 576 non-null float64 18 RENEWABLE_SOURCE_PERCENTUAL 567 non-null float64 19 NOTES 575 non-null object dtypes: datetime64[ns](1), float64(10), int64(1), object(8) memory usage: 90.4+ KB
#Create a graph depending of timeline
monthly_sms = data.groupby(["MONTH"])["MONTHLY_SMC"].sum()
tim1 = px.bar(x=monthly_sms.index, y=monthly_sms.values)
tim1.update_layout(xaxis_tickformat="%b")
tim1.update_xaxes(title_text="Month", nticks=12)
tim1.update_yaxes(title_text="Monthly consumption of CO2, m3" )
tim1.show()
#tim1.write_image("offices_timebase1.jpeg")
tim1.write_image("offices_timebase1.png")
#NAN values count in a montly consumption
MONTHLY_F1_nan = data["MONTHLY_F1"].isnull().sum()
MONTHLY_F1_nan
2
MC_nan = data["MC"].isnull().sum()
MC_nan
5
MONTHLY_SMC_nan = data["MONTHLY_SMC"].isnull().sum()
MONTHLY_SMC_nan
0
#RENEWABLE_SOURCE_PERCENTUAL analysis
data[data["RENEWABLE_SOURCE_PERCENTUAL"] > 0].count()
ID 12 BILL_ID 12 SUPPLIER 12 OFFICE 12 UTILITY_LOCATION 12 UTILITY_CITY 12 UTILITY_REGION 12 SUPPLY_TYPE 12 MONTHLY_F1 12 MONTHLY_F2 12 MONTHLY_F3 12 MC 12 MONTHLY_SMC 12 MONTH 12 COST_Kw_h 12 COST_MC 12 COST_SMC 12 MONTHLY_BILL_AMOUNT 12 RENEWABLE_SOURCE_PERCENTUAL 12 NOTES 12 dtype: int64
data["OFFICE"].unique()
array(['ALTRO', 'MILANO', 'GENOVA', 'AVELLINO', 'SENIGALLIA', 'FERRARA',
'ORTONA'], dtype=object)
data["RENEWABLE_SOURCE_PERCENTUAL"].unique()
#Basically only 12 resourses where using renewable source of energy
array([0. , 0.45, nan, 1. ])
ren1 = px.pie(data, values="RENEWABLE_SOURCE_PERCENTUAL", names="OFFICE", title="Renewable sources percentage")
ren1.show()
ren1.write_image("office_renewable.png")
#SUPPLIER and SUPPLY_TYPE analysis
data["SUPPLIER"].unique()
array(['A2A ENERGIA SPA', 'STADTISCHES ELEKTRIZITATSWERK STERZING',
'GREEN NETWORK SPA', 'AGN/GPL', 'ASCOTRADE', 'AXPO ITALIA S.p.A.',
'DUFERCO ENERGIA SPA', 'ENI GAS E LUCE SPA', 'GOLDENERGY s.r.l.',
'HERA S.p.A.', 'FORNITORE', 'IREN MERCATO S.p.A.', 'SIDIGAS SRL',
'SICME ENERGY E GAS SRL',
'SERVIZIO ELETTRICO NAZIONALE - SERVIZIO DI MAGGIOR TUTELA',
'ELETTRAGAS S.R.L.', '#NAME?', 'COBEGAS SRL',
'ROMA GAS & POWER SRL', 'ZELO'], dtype=object)
data["SUPPLY_TYPE"].unique()
array(['METANO', 'ENERGIA', 'GPL', 'ACQUA', nan, 'ALTRO GAS'],
dtype=object)
#s.rename({1: 3, 2: 5})
#supply = pd.Series(data["SUPPLY_TYPE"])
data["SUPPLY_TYPE"] = data["SUPPLY_TYPE"].replace({"METANO": "METHANE", "ENERGIA": "ENERGY", "GPL": "LPG",
"ACQUA": "WATER", "ALTRO GAS": "OTHER GAS"})
data["SUPPLY_TYPE"].value_counts() #GPL = Liquefied petroleum gas
ENERGY 301 METHANE 214 WATER 35 LPG 15 OTHER GAS 12 Name: SUPPLY_TYPE, dtype: int64
#Group the offices, supplier, supply_type and count how many of them are there
group_sup = data.groupby(["OFFICE", "SUPPLY_TYPE", "SUPPLIER"], as_index=False).count()
group_sup.head()
| OFFICE | SUPPLY_TYPE | SUPPLIER | ID | BILL_ID | UTILITY_LOCATION | UTILITY_CITY | UTILITY_REGION | MONTHLY_F1 | MONTHLY_F2 | MONTHLY_F3 | MC | MONTHLY_SMC | MONTH | COST_Kw_h | COST_MC | COST_SMC | MONTHLY_BILL_AMOUNT | RENEWABLE_SOURCE_PERCENTUAL | NOTES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ALTRO | ENERGY | #NAME? | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 | 9 |
| 1 | ALTRO | ENERGY | A2A ENERGIA SPA | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 | 21 |
| 2 | ALTRO | ENERGY | AXPO ITALIA S.p.A. | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 42 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 | 43 |
| 3 | ALTRO | ENERGY | DUFERCO ENERGIA SPA | 62 | 62 | 62 | 62 | 62 | 62 | 62 | 62 | 58 | 62 | 62 | 62 | 62 | 62 | 61 | 58 | 59 |
| 4 | ALTRO | ENERGY | GOLDENERGY s.r.l. | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 |
#Visualisation of each supplier
sup1 = px.sunburst(group_sup, path=["OFFICE", "SUPPLY_TYPE", "SUPPLIER"], values=group_sup["ID"])
sup1.show()
sup1.write_image("office_sunburst_supplier.png")
/Users/olhafedyshyn/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/olhafedyshyn/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. /Users/olhafedyshyn/opt/anaconda3/lib/python3.9/site-packages/plotly/express/_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
#Sum the data by offices
data_group3 = data.groupby(by=["OFFICE"]).sum()
data_group3
| ID | MONTHLY_F1 | MONTHLY_F2 | MONTHLY_F3 | MC | MONTHLY_SMC | COST_Kw_h | COST_MC | COST_SMC | MONTHLY_BILL_AMOUNT | RENEWABLE_SOURCE_PERCENTUAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| OFFICE | |||||||||||
| ALTRO | 90763 | 16329.88 | 14140.81 | 17567.54 | 4.0 | 6951.90 | 244.02 | 0.00 | 276.74 | 27128.03 | 1.00 |
| AVELLINO | 12987 | 67235.00 | 15846.00 | 24098.00 | 711.5 | 0.00 | 4.80 | 19.75 | 0.00 | 32805.61 | 0.00 |
| FERRARA | 36773 | 163033.23 | 55019.79 | 97396.95 | 3611.0 | 29575.00 | 7.10 | 0.00 | 15.28 | 117034.19 | 0.00 |
| GENOVA | 5145 | 13683.00 | 4881.00 | 7472.00 | 0.0 | 2119.00 | 5.55 | 0.00 | 67.01 | 13434.99 | 4.50 |
| MILANO | 1257 | 148620.00 | 45667.00 | 71554.00 | 0.0 | 28867.00 | 2.20 | 0.00 | 9.36 | 70456.00 | 0.45 |
| ORTONA | 11928 | 93896.60 | 33802.78 | 60093.82 | 0.0 | 766.91 | 0.00 | 0.00 | 3.90 | 801.25 | 0.00 |
| SENIGALLIA | 8478 | 8281.94 | 4464.47 | 7755.29 | 0.0 | 6006.43 | 4.98 | 0.00 | 13.86 | 13271.06 | 0.00 |
fig5 = go.Figure(data=[
go.Bar(name="MONTLY_SMC", x=data_group3.index, y=data_group3["MONTHLY_SMC"], yaxis="y", offsetgroup=1),
go.Bar(name="COST_SMC", x=data_group3.index, y=data_group3["COST_SMC"], yaxis="y2", offsetgroup=2)],
layout={
'yaxis': {'title': 'Consumption of gas, m3'},
'yaxis2': {'title': 'Cost, € per unit', 'overlaying': 'y', 'side': 'right'}
})
fig5.update_layout(barmode="group", xaxis_title="Office")
fig5.show()
fig8 = go.Figure(data=[
go.Bar(name="MC", x=data_group3.index, y=data_group3["MC"], yaxis="y", offsetgroup=1),
go.Bar(name="COST_MC", x=data_group3.index, y=data_group3["COST_MC"], yaxis="y2", offsetgroup=2)],
layout={
'yaxis': {'title': 'Consumption of water, m3'},
'yaxis2': {'title': 'Cost, € per unit', 'overlaying': 'y', 'side': 'right'}
})
fig8.update_layout(barmode="group", xaxis_title="Office")
fig8.show()
fig7 = go.Figure(data=[
go.Bar(name="MONTHLY_F1", x=data_group3.index, y=data_group3["MONTHLY_F1"]),
go.Bar(name="MONTHLY_F2", x=data_group3.index, y=data_group3["MONTHLY_F2"]),
go.Bar(name="MONTHLY_F3", x=data_group3.index, y=data_group3["MONTHLY_F3"])])
fig7.update_layout(barmode="group", xaxis_title="Office", yaxis_title=f"Electricity consumption <br> in different part of the month, Kw/h")
fig7.show()
fig7.write_image("office_monthly_consumption.png")